Clustering with Missing Data
1 데이터와 패키지 불러오기
1.1 사용할 패키지와 소스
1.2 데이터 불러오기
# Load the UCI wine data set; the raw file has no header row, so columns
# arrive as V1..V14 (V1 = true cluster label).
dat <- read.csv(
  "/Users/iseunghun/course/결측데이터분석/project2/wine.data",
  header = FALSE
)
# Fix the RNG state so the missingness masks built below are reproducible.
set.seed(1)
#V1 : Cluster Label (True Value) [59 vs 71 vs 48]
#V2 : Alcohol : 알코올 도수
#V3 : Malic acid : 말산
#V4 : Ash 회분
#V5 : Alcalinity of ash : 회분의 알칼리도
#V6 : Magnesium : 마그네슘
#V7 : Total phenols 페놀 총량
#V8 : Flavanoids : 플라보노이드
#V9 : Nonflavanoid phenols : 비 플라보노이드 페놀
#V10: Proanthocyanins : 프로안토시아니딘
#V11: Color intensity : 색채 강도
#V12: Hue : 색조
#V13: OD280/OD313 of diluted wines : 희석된 와인의 OD280/OD313 비율
#V14: Proline : 프롤린
# label 은 제거
dat2<-dat[,-1]2 create missing
어떤 그룹에 속하느냐에 따라서 missing 비율이 달라진다고 가정한다. 따라서 NMAR 가정에 가깝다. 결측을 발생시킬 변수들은 다음과 같이 그룹마다 차이가 큰 변수를 채택
## get PCs
PCA를 진행한 다음 가장 클러스터 레이블과 연관성이 깊은 변수를 채택하여 결측 발생에 이용한다. 이 과정에서 MAR의 성격이 다소 포함되지만 주로 클러스터 레이블이 영향을 주고 있기 때문에 MNAR에 가깝다고 할 수 있다.
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## V1 0.462 0.167 0.160 0.319 0.162 0.262 0.728
## V2 -0.323 0.401 0.310 -0.121 0.168 -0.698 -0.232 0.193
## V3 0.234 0.343 0.163 0.371 -0.482 -0.599 -0.188 0.146
## V4 0.448 -0.544 -0.339 -0.151 0.142 0.337 -0.433 0.209
## V5 0.357 0.207 -0.499 0.345 -0.406 0.483 -0.209
## V6 -0.257 0.247 -0.327 0.495 0.542 -0.436 -0.161
## V9 0.347 0.213 -0.483 0.225 -0.717 -0.152
## V10 -0.322 -0.304 0.277 -0.652 0.461 -0.216 0.173
## V11 0.533 0.314 0.140 0.509 0.197 0.311 -0.444
## V14 -0.456 0.229 -0.238 0.253 0.714 0.311
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
## Cumulative Var 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
## Comp.10
## SS loadings 1.0
## Proportion Var 0.1
## Cumulative Var 1.0
2.1 13% missing data
2.1.1 create missing
##### 13% missing graph (Red : Observed, Blue : Missing)
# PCA on the standardized variables, dropping columns 7, 8, 12, 13
# (V7, V8, V12, V13 — the variables that will receive missingness).
# Note the true label V1 is deliberately kept, so high PC scores carry
# cluster information and the missingness is NMAR-like.
pca = princomp(scale(dat[,-c(7,8,12,13)]))
s <- pca$scores
dat13<-dat[,-1]

# One 0/1 inclusion mask per target variable, each from its own seed so the
# draws match the original report exactly.
# NOTE(review): prob = c(0.13, 0.85) sums to 0.98; sample() silently
# renormalizes, so P(index == 1) is 0.85/0.98 ≈ 0.867, not 0.85 — confirm
# these weights were intended.
draw_mask <- function(seed) {
  set.seed(seed)
  sample(size = nrow(dat), x = c(0, 1), replace = TRUE, prob = c(0.13, 0.85))
}
index1 = draw_mask(1)
index2 = draw_mask(2)
index3 = draw_mask(5)
index4 = draw_mask(7)

# Knock out values only for rows in the top 15% of the 10th PC score,
# thinned by the per-variable masks.
hi_pc10 <- s[,10] >= quantile(s[,10], 0.85)
dat13$V7[hi_pc10 & index1 == 1]<-NA
dat13$V8[hi_pc10 & index2 == 1]<-NA
dat13$V12[hi_pc10 & index3 == 1]<-NA
dat13$V13[hi_pc10 & index4 == 1]<-NA
2.1.2 missing prop per cluster
## [1] 0.08474576
## [1] 0.1126761
## [1] 0.2916667
2.1.3 missing prop per var
## V2 V3 V4 V5 V6 V7 V8 V9
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.1348315 0.1348315 0.0000000
## V10 V11 V12 V13 V14
## 0.0000000 0.0000000 0.1348315 0.1404494 0.0000000
2.1.4 missing data pattern
## V2 V3 V4 V5 V6 V9 V10 V11 V14 V7 V8 V12 V13
## 151 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2
## 1 1 1 1 1 1 1 1 1 1 1 0 0 1 2
## 1 1 1 1 1 1 1 1 1 1 1 0 0 0 3
## 2 1 1 1 1 1 1 1 1 1 0 1 0 0 3
## 3 1 1 1 1 1 1 1 1 1 0 0 1 0 3
## 1 1 1 1 1 1 1 1 1 1 0 0 0 1 3
## 18 1 1 1 1 1 1 1 1 1 0 0 0 0 4
## 0 0 0 0 0 0 0 0 0 24 24 24 25 97
2.2 27% missing
##### 27% missing graph (Red : Observed, Blue : Missing)
# Same per-variable masks as the 13% scenario (same seeds), but the PC-score
# cutoff drops from the 0.85 to the 0.70 quantile, roughly doubling the
# number of rows eligible for deletion.  Reuses the PC scores `s` computed
# in the 13% chunk.
# NOTE(review): prob = c(0.13, 0.85) sums to 0.98 and is renormalized by
# sample() — confirm intended (see 13% chunk).
draw_mask <- function(seed) {
  set.seed(seed)
  sample(size = nrow(dat), x = c(0, 1), replace = TRUE, prob = c(0.13, 0.85))
}
index1 = draw_mask(1)
index2 = draw_mask(2)
index3 = draw_mask(5)
index4 = draw_mask(7)
dat27<-dat[,-1]
hi_pc10 <- s[,10] >= quantile(s[,10], 0.70)
dat27$V7[hi_pc10 & index1 == 1]<-NA
dat27$V8[hi_pc10 & index2 == 1]<-NA
dat27$V12[hi_pc10 & index3 == 1]<-NA
dat27$V13[hi_pc10 & index4 == 1]<-NA
2.2.1 missing prop per cluster
## [1] 0.1864407
## [1] 0.2957746
## [1] 0.4583333
2.2.2 missing prop per var
## V2 V3 V4 V5 V6 V7 V8 V9
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.2696629 0.2640449 0.0000000
## V10 V11 V12 V13 V14
## 0.0000000 0.0000000 0.2696629 0.2865169 0.0000000
2.2.3 missing data pattern
## V2 V3 V4 V5 V6 V9 V10 V11 V14 V8 V7 V12 V13
## 124 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 2 1 1 1 1 1 1 1 1 1 1 1 0 0 2
## 5 1 1 1 1 1 1 1 1 1 1 0 0 0 3
## 1 1 1 1 1 1 1 1 1 1 0 1 0 1 2
## 3 1 1 1 1 1 1 1 1 1 0 1 0 0 3
## 6 1 1 1 1 1 1 1 1 1 0 0 1 0 3
## 2 1 1 1 1 1 1 1 1 1 0 0 0 1 3
## 35 1 1 1 1 1 1 1 1 1 0 0 0 0 4
## 0 0 0 0 0 0 0 0 0 47 48 48 51 194
스케일에 차이가 많이 나기 때문에, scale 한 다음에 k-means를 시도해야 함
3 missing & observed 변수의 분포 (MCAR 아님을 확인)
missing index 지정해주는 코드
# Missing-status labels per target variable: "Observed" where the value is
# present, "Missing" where it was set to NA.  (Replaces the un-idiomatic
# `is.na(x) == 0` comparison; behavior is identical.)
flag_missing <- function(x) ifelse(is.na(x), "Missing", "Observed")
missing_V7_13  <- flag_missing(dat13$V7)
missing_V8_13  <- flag_missing(dat13$V8)
missing_V12_13 <- flag_missing(dat13$V12)
missing_V13_13 <- flag_missing(dat13$V13)
missing_V7_27  <- flag_missing(dat27$V7)
missing_V8_27  <- flag_missing(dat27$V8)
missing_V12_27 <- flag_missing(dat27$V12)
missing_V13_27 <- flag_missing(dat27$V13)
4 13% missing (클러스터 고려하지 않음)
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
colnames(temp7) = c("Total_phenols","missing")
ggplot(temp7,mapping = aes(x= Total_phenols,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
colnames(temp8) = c("Flavanoids","missing")
ggplot(temp8,mapping = aes(x= Flavanoids,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
colnames(temp12) = c("Hue","missing")
ggplot(temp12,mapping = aes(x= Hue,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
colnames(temp13) = c("ratio","missing")
ggplot(temp13,mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)4.1 missing 27% (클러스터 고려하지 않음)
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
colnames(temp7) = c("Total_phenols","missing")
ggplot(temp7,mapping = aes(x= Total_phenols,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
colnames(temp8) = c("Flavanoids","missing")
ggplot(temp8,mapping = aes(x= Flavanoids,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
colnames(temp12) = c("Hue","missing")
ggplot(temp12,mapping = aes(x= Hue,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
colnames(temp13) = c("ratio","missing")
ggplot(temp13,mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)K-means 알고리듬의 핵심은 적절한 centroid를 찾을 수 있냐 이기 때문에 클러스터에 대한 정보를 이용하는 것이 더 타당하다. 클러스터를 고려치 않을시에는 오히려 MCAR 이어 보이지만 mean imputation을 진행하여 클러스터링을 진행할시 가장 성능이 좋지 않을것이다.
5 13% missing (클러스터 고려)
5.0.1 Cluster1
# V7 (Total phenols), cluster 1 only.
# BUG FIX: the original called ggplot() on temp13 (stale from an earlier
# chunk) while filtering on temp7$V1 — plot temp7 itself.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 1 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 1 only
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 1 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
5.0.2 Cluster2
# V7 (Total phenols), cluster 2 only.
# BUG FIX: original plotted temp13 while filtering on temp7$V1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 2 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 2 only.
# BUG FIX: original plotted/filtered temp13 instead of temp12.
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 2 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
5.0.3 Cluster3
# V7 (Total phenols), cluster 3 only.
# BUG FIX: original plotted temp13 while filtering on temp7$V1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_13
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 3 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_13
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 3 only.
# BUG FIX: original plotted/filtered temp13 instead of temp12.
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_13
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 3 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_13
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
6 27% missing (클러스터 고려)
6.0.1 Cluster1
# V7 (Total phenols), 27% scenario, cluster 1 only.
# BUG FIX: original plotted temp13 while filtering on temp7$V1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 1 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 1 only.
# BUG FIX: original plotted/filtered temp13 instead of temp12.
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 1 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 1,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
6.0.2 Cluster2
# V7 (Total phenols), 27% scenario, cluster 2 only.
# BUG FIXES: original set colnames on temp13 instead of temp7, and then
# plotted temp13 while filtering on temp7$V1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 2 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 2 only.
# BUG FIX: original plotted/filtered temp13 instead of temp12.
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 2 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 2,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
6.0.3 Cluster3
# V7 (Total phenols), 27% scenario, cluster 3 only.
# BUG FIXES: original set colnames on temp13 instead of temp7, and then
# plotted temp13 while filtering on temp7$V1.
temp7 = data.frame(as.numeric(dat2_scaled$V7))
temp7$missing = missing_V7_27
temp7$V1 <- dat$V1
colnames(temp7) = c("ratio","missing","V1")
ggplot(temp7[temp7$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V8 (Flavanoids), cluster 3 only
temp8 = data.frame(as.numeric(dat2_scaled$V8))
temp8$missing = missing_V8_27
temp8$V1 <- dat$V1
colnames(temp8) = c("ratio","missing","V1")
ggplot(temp8[temp8$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V12 (Hue), cluster 3 only.
# BUG FIX: original plotted/filtered temp13 instead of temp12.
temp12 = data.frame(as.numeric(dat2_scaled$V12))
temp12$missing = missing_V12_27
temp12$V1 <- dat$V1
colnames(temp12) = c("ratio","missing","V1")
ggplot(temp12[temp12$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
# V13 (OD ratio), cluster 3 only
temp13 = data.frame(as.numeric(dat2_scaled$V13))
temp13$missing = missing_V13_27
temp13$V1 <- dat$V1
colnames(temp13) = c("ratio","missing","V1")
ggplot(temp13[temp13$V1 == 3,],mapping = aes(x= ratio,color =missing, fill = missing))+
geom_histogram(alpha = 0.2, position = "identity",binwidth = 0.5)
클러스터 레이벨이 3>2>1 순으로, 그리고 missing 비율이 높아질수록 분포의 왜곡이 심해짐을 알 수 있다.
7 데이터 탐구 (EDA)
7.2 변수들의 분산 확인
## V2 V3 V4 V5 V6 V7
## 6.590623e-01 1.248015e+00 7.526464e-02 1.115269e+01 2.039893e+02 3.916895e-01
## V8 V9 V10 V11 V12 V13
## 9.977187e-01 1.548863e-02 3.275947e-01 5.374449e+00 5.224496e-02 5.040864e-01
## V14
## 9.916672e+04
스케일에 차이가 많이 나기 때문에, scale 한 다음에 k-means를 시도해야 함
7.3 변수 정규성 가정 만족 여부
###V2
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V2
## W = 0.9818, p-value = 0.02005
###V3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V3
## W = 0.88878, p-value = 2.946e-10
###V4
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V4
## W = 0.98395, p-value = 0.03868
###V5
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V5
## W = 0.99023, p-value = 0.2639
###V6
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V6
## W = 0.93833, p-value = 6.346e-07
###V7
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V7
## W = 0.97668, p-value = 0.004395
###V8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V8
## W = 0.95453, p-value = 1.679e-05
###V9
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V9
## W = 0.96252, p-value = 0.0001055
###V10
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V10
## W = 0.98072, p-value = 0.01445
###V11
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V11
## W = 0.94032, p-value = 9.229e-07
###V12
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V12
## W = 0.98134, p-value = 0.01743
###V13
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V13
## W = 0.94505, p-value = 2.316e-06
###V14
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Shapiro-Wilk normality test
##
## data: dat2_scaled$V14
## W = 0.93119, p-value = 1.741e-07
standardizing 하고나서, 정규성은 만족하지 않음. 변수변환도 시도해봤으나 큰 도움이 되지 못했다. 오히려 정규성 가정을 만족하지 않는 변수를 imputation 등을 하여 클러스터링을 했을때 성능이 어떻게 되는지 확인해 보는게 더 의미 있다고 판단함.
8 CLUSTERING
kmeans 알고리듬은 초깃값에 따라 성능에 영향을 많이 받는 문제가 있어 초깃값 10개를 사용했다.
8.1 K-means (complete data)
true_label = dat$V1
complete <- kmeans(dat2_scaled, centers = 3, nstart =10 )
print(adjustedRandIndex(complete$cluster, true_label))## [1] 0.897495
## [1] 0.2322567
8.2 K-means reference model
fully observed variable 만을 이용하여 클러스터링 진행
8.2.1 dat13 & dat 27 동일
# use only complete variable
F_o = dat13[,colSums(is.na(dat13))==0]
init <- kmeans(scale(F_o), centers = 3, iter.max = 100000, nstart = 10)
print(adjustedRandIndex(init$cluster, true_label))## [1] 0.7310778
## [1] 0.1769191
최소한 0.731 보다는 높아야 함
먼저, 사전에 결측대체를 진행하지 않는 알고리듬의 성능을 보고자 한다.
8.3 KSC
8.3.1 dat13_scaled
가장 적합한 w (weight를 찾아줘야함) 이건 그냥 예시
## [1] 0.8350791
## [1] 0.1769191
8.4 K-POD
8.5 ClusterImpute
8.5.1 data13_scaled
A1<-ClustImpute(dat13_scaled,3)
stat <- cluster.stats(d = dist((dat2_scaled)),A1$cluster, dat$V1)
stat$wb## [1] 0.6600923
## [1] 0.1620794
## [1] 0.8486782
8.5.2 data27_scaled
A1<-ClustImpute(dat27_scaled,3)
stat <- cluster.stats(d = dist((dat2_scaled)),A1$cluster, dat$V1)
stat$wb## [1] 0.6600923
## [1] 0.1620794
## [1] 0.8486782
다음으로는 사전에 결측대체를 진행하고 클러스터링을 진행하는 방법의 성능을 보고자 한다. 이때 관건은 얼마나 그럴듯한 값으로 대체를 하냐일것이다. 이때, 표준화를 거친 후에 결측대체를 진행한 경우와, 표준화를 거치기 전에 결측대체를 진행하고 클러스터링을 실시한 경우로 나누어 생각했다. 대체로 전자의 경우 성능이 좋았다.
8.6 K-NN Imputation
8.6.1 dat13_scaled
dat13_scaled1<-knnImputation(dat13_scaled, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat13_scaled15<-knnImputation(dat13_scaled, k = 15, scale = T, meth = "weighAvg", distData = NULL)
S11<-kmeans(dat13_scaled1,3 , nstart =10); #S11$cluster
S11_stat <- cluster.stats(d = dist((dat2_scaled)), S11$cluster, dat$V1)
print(adjustedRandIndex(S11$cluster,true_label))## [1] 0.8486782
## [1] 0.1620794
S12<-kmeans(dat13_scaled15,3, nstart = 10 ); #S12$cluster
S12_stat <- cluster.stats(d = dist((dat2_scaled)), S12$cluster, dat$V1)
print(adjustedRandIndex(S12$cluster,true_label))## [1] 0.8652458
## [1] 0.1620794
8.6.2 dat27_scaled
dat27_scaled1<-knnImputation(dat27_scaled, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat27_scaled15<-knnImputation(dat27_scaled, k = 15, scale = T, meth = "weighAvg", distData = NULL)
S13<-kmeans(dat27_scaled1,3, nstart = 10); #S13$cluster
S13_stat <- cluster.stats(d = dist((dat2_scaled)), S13$cluster, dat$V1)
print(adjustedRandIndex(S13$cluster,true_label))## [1] 0.7423645
## [1] 0.1324693
S14<-kmeans(dat27_scaled15,3, nstart = 10 ); #S14$cluster
S14_stat <- cluster.stats(d = dist((dat2_scaled)), S14$cluster, dat$V1)
print(adjustedRandIndex(S14$cluster,true_label))## [1] 0.8353221
## [1] 0.1772105
8.6.3 dat13 not scaled imputation
dat13_1<-knnImputation(dat13, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat13_15<-knnImputation(dat13, k = 15, scale = T, meth = "weighAvg", distData = NULL)
S11<-kmeans(scale(dat13_1),3, nstart = 10); #S11$cluster
print(adjustedRandIndex(S11$cluster,true_label))## [1] 0.8653209
## [1] 0.1620794
S12<-kmeans(scale(dat13_15),3, nstart = 10); #S12$cluster
print(adjustedRandIndex(S12$cluster,true_label))## [1] 0.8483196
## [1] 0.2322567
8.6.4 dat27 not scaled impu + scale
dat27_1<-knnImputation(dat27, k = 1, scale = T, meth = "weighAvg", distData = NULL)
dat27_15<-knnImputation(dat27, k = 15, scale = T, meth = "weighAvg", distData = NULL)
S13<-kmeans(scale(dat27_1),3, nstart =10); #S13$cluster
print(adjustedRandIndex(S13$cluster,true_label))## [1] 0.7423645
## [1] 0.1324693
S14<-kmeans(scale(dat27_15),3, nstart = 10 ); #S14$cluster
print(adjustedRandIndex(S14$cluster,true_label))## [1] 0.8186897
## [1] 0.1772105
8.7 Mean-Imputation
8.7.1 dat13_scaled
## [1] 0.7763463
## [1] 0.1520128
8.7.2 dat27_scaled
## [1] 0.1605927
## [1] 0.7787078
8.7.3 dat13 not scaled impu + scale
## [1] 0.1537044
## [1] 0.6989163
8.8 Unconditional Hot-Deck
8.8.1 dat13_scaled
## [1] 0.7209152
## [1] 0.1537044
8.8.2 dat27_scaled
## [1] 0.1300351
## [1] 0.8376253
8.8.3 dat13 not scale impu + scale
## [1] 0.7920856
## [1] 0.1520128
8.9 MI with pmm
MI 의 경우 cluster label 이 매번 임의로 label 되는 점을 해결하기 위해서 함수를 작성
# Return the modal (most frequent) value of a vector.  Ties resolve to the
# candidate that appears first in `v`, matching which.max's first-maximum
# behavior.
getmode <- function(v) {
  candidates <- unique(v)
  counts <- tabulate(match(v, candidates))
  candidates[which.max(counts)]
}
# Majority-vote cluster labels across the imputed data sets of a mice-style
# imputation object.  Each imputed set is clustered with k-means; because
# k-means labels are arbitrary per run, each run's labels are remapped onto
# the known row blocks of the wine data (1-59, 60-130, 131-178) before the
# per-observation vote.  (Original comment said "KNN" — it votes over
# k-means runs, not KNN.)
vote <- function(multiple_imputation_set){
  group1 = 1:59
  group2 = 60:130
  group3 = 131:178
  n_imp = 10  # number of imputed data sets clustered and voted over
  # Size the result from the completed data itself rather than the global
  # dat13_scaled, so the function also works for other imputation objects
  # with the same row layout.
  n_obs = nrow(complete(multiple_imputation_set, 1))
  result = matrix(0, nrow = n_imp, ncol = n_obs)
  for (i in 1:n_imp){
    data = complete(multiple_imputation_set, i)
    init <- kmeans(scale(data), centers = 3, iter.max = 100000, nstart = 10)
    result[i,] = init$cluster
    # Which arbitrary label dominates each known group in this run?
    index1 = getmode(result[i,group1])
    index2 = getmode(result[i,group2])
    index3 = getmode(result[i,group3])
    # Remap through 11/12/13 first so successive renames cannot collide.
    result[i,result[i,]==index1] = 11
    result[i,result[i,]==index2] = 12
    result[i,result[i,]==index3] = 13
    result[i,] = result[i,]-10
  }
  # Final label per observation = mode across the imputations.
  final_result = apply(result,2,getmode)
  return(final_result)
}
8.9.1 dat13_scaled
pmm 방법을 사용해서 imputation set 10개를 만듦
## [1] 0.8639879
## [1] 0.1620794
8.9.2 dat27_scaled
## [1] 0.8188574
## [1] 0.1830305
8.9.3 dat13_scaled
pmm 방법을 사용해서 imputation set 10개를 만듦
## [1] 0.8640004
## [1] 0.203264
8.10 MI with cart
8.10.1 dat13_scaled
## [1] 0.8321902
## [1] 0.1538608
8.10.2 dat27_scaled
## [1] 0.7566207
## [1] 0.1520128
8.11 MI with rf
8.11.1 dat13_scaled
## [1] 0.8637748
## [1] 0.1620794
8.11.2 dat27_scaled
## [1] 0.8651097
## [1] 0.2322567
8.12 MI with norm
8.12.1 dat13_scaled
## [1] 0.8008503
## [1] 0.1620794
8.12.2 dat27_scaled
## [1] 0.7556455
## [1] 0.192957
8.13 MI with norm.predict
8.13.1 dat13_scaled
## [1] 0.8651097
## [1] 0.2322567
8.13.2 dat27_scaled
## [1] 0.7406843
## [1] 0.1538608
8.13.3 dat13
## [1] 0.8651097
## [1] 0.2322567
8.14 MI with norm.nob
8.14.1 dat13_scaled
## [1] 0.8653209
## [1] 0.203264
8.14.2 dat27_scaled
## [1] 0.7699792
## [1] 0.1538608
8.14.3 dat13_scaled
## [1] 0.8486782
## [1] 0.203264
8.15 EM & DA
DA 를 이용한 imputation 과 같은 경우에는 초깃값에 따라 영향을 받는 경우가 많아 범위로 보는 것이 더 적절할 것이다. 뒤에서 시뮬레이션을 따로 다루고자 한다.
8.15.1 dat13_scaled
# Sensitivity of DA (data augmentation, `norm` package) imputation on the
# scaled 13% data: 50 RNG seeds x 10 DA step counts (5, 10, ..., 50),
# scoring each imputation-then-k-means run by adjusted Rand index.
iter = seq(5,50,5)
result_da_13 = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
for (j in 1:length(iter)){
# rngseed() seeds the norm package's internal RNG for this run.
rngseed(i)
# NOTE(review): this reuses the name `s`, clobbering the PCA score matrix
# defined earlier in the document — harmless if chunks run top-to-bottom
# (the missingness masks are already built), but rename if re-run out of
# order.
s<- prelim.norm(as.matrix(dat13_scaled))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat13_scaled)
# Single k-means start here, so variation across (i, j) mixes imputation
# noise with k-means initialization noise.
da<-kmeans(ximp,3);# S31$cluster
result_da_13[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}8.15.2 dat27_scaled
iter = seq(5,50,5)
result_da_27 = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
for (j in 1:length(iter)){
rngseed(i)
s<- prelim.norm(as.matrix(dat27_scaled))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat27_scaled)
da<-kmeans(ximp,3);# S31$cluster
result_da_27[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}8.15.3 dat13
iter = seq(5,50,5)
result_da_13_nonscale = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
for (j in 1:length(iter)){
rngseed(i)
s<- prelim.norm(as.matrix(dat13))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat13)
da<-kmeans(scale(ximp),3);# S31$cluster
result_da_13_nonscale[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}
max(result_da_13_nonscale)## [1] 0.8820585
8.15.4 dat27
iter = seq(5,50,5)
result_da_27_nonscale = matrix(c(rep(0, 50*length(iter))), nrow = 50)
for (i in 1:50){
for (j in 1:length(iter)){
rngseed(i)
s<- prelim.norm(as.matrix(dat27))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat27)
da<-kmeans(scale(ximp),3);# S31$cluster
result_da_27_nonscale[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}
max(result_da_27_nonscale)## [1] 0.8992963
9 Simulation
일부 알고리듬의 stability를 보고자 하여 여러 초깃값에 대해 ARI를 산출했다. 가로축은 초깃값을 뜻하고 세로축은 ARI를 뜻한다. 이때 kmeans를 활용하는 경우의 nstart option을 100으로 주었으므로, 초깃값에 따른 변동분은 순전히 imputation에 기인한다고 볼 수 있다고 생각한다.
9.1 EM
9.2 DA
9.2.1 dat13_scaled
iter = seq(5,50,10)
result_da_13 = matrix(c(rep(0, 100*length(iter))), nrow = 100)
for (i in 1:100){
for (j in 1:length(iter)){
rngseed(i)
s<- prelim.norm(as.matrix(dat13_scaled))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat13_scaled)
da<-kmeans(ximp,3, nstart = 100);# S31$cluster
result_da_13[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}da의 steps option = 5
index =1:100
result_da_13 = data.frame(result_da_13, index)
ggplot(result_da_13)+
geom_line(mapping = aes(x= index, y = X1), color = "red")+
geom_point(mapping = aes(x= index, y = X1), color = "red")da의 steps option = 50
index =1:100
result_da_13 = data.frame(result_da_13, index)
ggplot(result_da_13)+
geom_line(mapping = aes(x= index, y = X5), color = "orange")+
geom_point(mapping = aes(x= index, y = X5), color = "orange")9.2.2 dat27_scaled
iter = seq(5,50,10)
result_da_27 = matrix(c(rep(0, 100*length(iter))), nrow = 100)
for (i in 1:100){
for (j in 1:length(iter)){
rngseed(i)
s<- prelim.norm(as.matrix(dat27_scaled))
thetahat <- em.norm(s,showits = F)
theta <- da.norm(s, thetahat, steps = iter[j], showits = F)
#getparam.norm(s, theta)
ximp <- imp.norm(s, theta, dat27_scaled)
da<-kmeans(ximp,3, nstart = 100 );# S31$cluster
result_da_27[i,j]=adjustedRandIndex(da$cluster,true_label)
}
}da의 steps option = 5
index =1:100
result_da_27 = data.frame(result_da_27, index)
ggplot(result_da_27)+
geom_line(mapping = aes(x= index, y = X1), color = "red")+
geom_point(mapping = aes(x= index, y = X1), color = "red")da의 steps option = 50
index =1:100
result_da_27 = data.frame(result_da_27, index)
ggplot(result_da_27)+
geom_line(mapping = aes(x= index, y = X5), color = "orange")+
geom_point(mapping = aes(x= index, y = X5), color = "orange")9.3 Unconditional Hot-Deck
9.4 norm.nob
9.5 KSC
9.5.1 dat13_scaled
코드 돌린 파일 있어서 안돌림 ksc_search_grid13, ksc_search_grid13
result13_ksc = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid13.csv")[,-1]
result13_ksc_object = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid13_object.csv")[,-1]
max(result13_ksc)## [1] 0.8350791
최대 성능은 0.835
weight 파라미터에 따른 adj index(초기치에 대한 영향력을 줄이기 위해 웨이트 마다 10개 초기치에 대해서 평균을 내림 )
mean_adjrandindex = apply(result13_ksc,1,mean)
w = seq(0.2,0.95,by = 0.05)
temp = data.frame(w, mean_adjrandindex)
ggplot(temp)+
geom_line(mapping = aes(x= w, y = mean_adjrandindex))+
geom_point(mapping = aes(x= w, y = mean_adjrandindex)) 0.5가 최적의 파라미터란 사실을 알 수 있음
9.5.2 dat27_scaled
코드 돌린 파일 있어서 안돌림
result27_ksc = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid27.csv")[,-1]
result27_ksc_object = read.csv("/Users/iseunghun/course/결측데이터분석/project2/ksc_search_grid27_object.csv")[,-1]
max(result27_ksc)## [1] 0.8350791
최대는 0.835
mean_adjrandindex = apply(result27_ksc,1,mean)
w = seq(0.2,0.95,by = 0.05)
temp = data.frame(w, mean_adjrandindex)
ggplot(temp)+
geom_line(mapping = aes(x= w, y = mean_adjrandindex))+
geom_point(mapping = aes(x= w, y = mean_adjrandindex))w = 0.5가 최적이었다.
9.5.3 loss value 와 ARI간의 관계
KSC 의 경우 k-means 방법을 기본으로 사용하기 때문에 클러스터 centroid 의 initial 값을 어디에 주냐에 따라서 성능이 달라진다. 보통의 k-means는 loss value 가 가장 낮았던 초깃값을 사용하면 되지만 KSC에서는 가장 loss value 가 작았던 초깃값이 ARI가 가장 높은 초깃값과 일치하는지는 밝혀지지 않아 확인해야 할 필요성을 느꼈다.
13% missing의 경우
## [1] 8 1 8 10 1 1 1 1 1 7 1 1 1 5 1 1
## [1] 2 1 1 10 7 5 5 1 1 7 2 1 3 5 2 8
27% missing의 경우
## [1] 4 4 5 2 10 1 5 1 1 1 1 1 1 5 1 1
## [1] 4 1 1 6 1 10 5 1 1 1 7 2 7 9 3 2
확인 결과 꼭 일치하지 않는다는 사실을 알 수 있었다. 게다가 KSC에서는 w 하이퍼 파라미터를 결정해야 하는 문제가 있다.(어떤 값을 쓰는냐에 따라서 성능차이가 심하게 발생한다.) 이론상으로는 결측이 발생한 변수가 클러스터링에 중요할수록 w 를 높이면 되지만 정확한 값을 얻기 어려울 뿐더러 이런 정보가 항상 가능한것도 아니다. 또한 클러스터 레이벨이 있는 데이터셋의 일부만 가지고 w 를 결정하는 것도 가능은 하겠지만, 이는 클러스터링 보다는 분류문제에 가깝지 않나 생각한다.(즉 실용성이 없다.)
9.6 K-pod
9.6.1 13% missing
# Stability of k-POD on the 13% data: 100 seeds, one ARI per seed.
# k-POD clusters directly on the incomplete matrix (no prior imputation).
result_kpod13 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
set.seed(i)
B1<-kpod(as.matrix(dat13_scaled),3,maxiter = 1000)
result_kpod13[i,] = adjustedRandIndex(B1$cluster, true_label)
}result_kpod13 = data.frame(result_kpod13, index)
ggplot(result_kpod13)+
geom_line(mapping = aes(x= index, y = result_kpod13))+
geom_point(mapping = aes(x= index, y = result_kpod13))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7136 0.7921 0.7921 0.7772 0.7921 0.7921
9.6.2 27% missing
result_kpod27 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
set.seed(i)
B1<-kpod(as.matrix(dat27_scaled),3)
result_kpod27[i,] = adjustedRandIndex(B1$cluster, true_label)
}result_kpod27 = data.frame(result_kpod27, index)
ggplot(result_kpod27)+
geom_line(mapping = aes(x= index, y = result_kpod27))+
geom_point(mapping = aes(x= index, y = result_kpod27))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7787 0.7787 0.7787 0.7787 0.7787 0.7787
9.7 ClustImpute
9.7.1 13% missing
result_clustimpute13 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
set.seed(i)
A1<-ClustImpute(dat13_scaled,3)
result_clustimpute13[i,] = adjustedRandIndex(A1$cluster, true_label)
}result_clustimpute13 = data.frame(result_clustimpute13, index)
ggplot(result_clustimpute13)+
geom_line(mapping = aes(x= index, y = result_clustimpute13))+
geom_point(mapping = aes(x= index, y = result_clustimpute13))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8487 0.8487 0.8487 0.8487 0.8487 0.8487
9.7.2 27% missing
result_clustimpute27 = matrix(rep(0,100), ncol = 1)
for (i in 1:100){
set.seed(i)
A1<-ClustImpute(dat27_scaled,3)
result_clustimpute27[i,] = adjustedRandIndex(A1$cluster, true_label)
}result_clustimpute27 = data.frame(result_clustimpute27, index)
ggplot(result_clustimpute27)+
geom_line(mapping = aes(x= index, y = result_clustimpute27))+
geom_point(mapping = aes(x= index, y = result_clustimpute27))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8487 0.8487 0.8487 0.8487 0.8487 0.8487
imputation에 기반한 방법론들은 imputation 값에 따라서 클러스터링 성능에 편차가 심하게 난다. 그 반면에 사전에 imputation을 진행하지 않는 방법들의 경우 거의 동일한 클러스터링 성능을 보인다.
10 시각화 예시
인덱스 맞춰주는 함수
# True block structure of the wine data (rows per true cluster).
group1 = 1:59
group2 = 60:130
group3 = 131:178
# Remap arbitrary k-means labels onto the canonical 1/2/3 order using the
# known row blocks above (same remapping trick as inside vote()).
# (Fixed: the heading "10.1 TRUE" was fused onto the closing brace.)
unite <- function(result){
  index1 = getmode(result[group1])
  index2 = getmode(result[group2])
  index3 = getmode(result[group3])
  # Rename through 11/12/13 first so successive renames cannot collide.
  result[result==index1] = 11
  result[result==index2] = 12
  result[result==index3] = 13
  result = result-10
  return(result)
}
10.1 TRUE
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,dat$V1)
colnames(ho) = c("PC1","PC2","cluster")
ggplot(ho)+
geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))10.2 mean imputation
10.2.1 13% missing
##
## iter imp variable
## 1 1 V7 V8 V12 V13
## 2 1 V7 V8 V12 V13
## 3 1 V7 V8 V12 V13
## 4 1 V7 V8 V12 V13
## 5 1 V7 V8 V12 V13
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(S21$cluster))
colnames(ho) = c("PC1","PC2","cluster")
ggplot(ho)+
geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))10.2.2 27% missing
##
## iter imp variable
## 1 1 V7 V8 V12 V13
## 2 1 V7 V8 V12 V13
## 3 1 V7 V8 V12 V13
## 4 1 V7 V8 V12 V13
## 5 1 V7 V8 V12 V13
pca = prcomp(dat2_scaled)$x[,c(1,2)]
ho = data.frame(pca,unite(S21$cluster))
colnames(ho) = c("PC1","PC2","cluster")
ggplot(ho)+
geom_point(mapping = aes( x= PC1, y =PC2 , color = as.factor(cluster)))11 imputation 체크
성능이 좋았던 imputation 방법과 안좋았던 imputation 방법의 bias 가 어떻게 나타나는지를 보고자 함.
11.1 Bias check (그룹을 고려하지 않음)
centering 했기 때문에 true 값은 0 이 나올것
11.1.1 mean
11.1.1.1 13% missing
## [1] -1.026994e-16
## [1] 1.675701e-16
## [1] -1.936787e-16
## [1] -3.761207e-17
11.1.2 KNN - 15
11.1.2.1 13% missing
## [1] -0.01568219
## [1] -0.01012611
## [1] -0.02413975
## [1] -0.007041984
11.1.3 Unconditional
11.1.3.1 13% missing
## [1] 0.007278275
## [1] 0.0007385788
## [1] 0.01494679
## [1] 0.00112046
11.1.4 pmm
11.1.4.1 13% missing
V7
## [1] 0.1326244
V8
## [1] 0.1853244
V12
## [1] 0.07837746
V13
## [1] -0.06703638
11.2 Bias check 그룹 별로
11.2.1 13%
11.2.1.1 V7
## V1 mean
## 1 0 0.08667527
## 2 0 0.09687105
## 3 0 -0.24982678
## V1 mean
## 1 0 0.01088159
## 2 0 0.06770110
## 3 0 -0.05536171
## V1 mean
## 1 0 0.09247370
## 2 0 0.05079736
## 3 0 -0.21579363
## mean
## 1 -0.017769185
## 2 -0.005051854
## 3 -0.019867709
11.2.1.2 V8
## V1 mean
## 1 0 0.10910734
## 2 0 0.08639061
## 3 0 -0.26189721
## V1 mean
## 1 0 0.01785856
## 2 0 0.04789199
## 3 0 -0.05524040
## V1 mean
## 1 0 0.1104413
## 2 0 0.1101193
## 3 0 -0.3013746
## mean
## 1 0.006408273
## 2 -0.031407916
## 3 -0.030143744
11.2.1.3 V12
## V1 mean
## 1 0 0.08744112
## 2 0 0.06440151
## 3 0 -0.20274028
## V1 mean
## 1 0 0.05406414
## 2 0 0.03009380
## 3 0 -0.02144935
## V1 mean
## 1 0 0.1128722
## 2 0 0.0170909
## 3 0 -0.2194467
## mean
## 1 0.04600703
## 2 -0.01730175
## 3 -0.06002310
11.2.1.4 V13
## V1 mean
## 1 0 0.10658423
## 2 0 0.07686309
## 3 0 -0.24470310
## V1 mean
## 1 0 0.04864109
## 2 0 0.01561124
## 3 0 -0.05676560
## V1 mean
## 1 0 0.11344062
## 2 0 0.08167337
## 3 0 -0.26440100
## mean
## 1 0.042386005
## 2 -0.014768742
## 3 -0.005394709
11.2.2 27% missing
11.2.3 V7
## V1 mean
## 1 0 0.1860152
## 2 0 0.1007617
## 3 0 -0.3776870
## V1 mean
## 1 0 0.04017108
## 2 0 0.08235247
## 3 0 -0.11219485
## V1 mean
## 1 0 0.2094141
## 2 0 0.1466471
## 3 0 -0.4057830
## mean
## 1 0.01491486
## 2 -0.01320186
## 3 -0.00200316
11.2.4 V8
## V1 mean
## 1 0 0.22061726
## 2 0 0.09301439
## 3 0 -0.40875917
## V1 mean
## 1 0 0.05941421
## 2 0 0.02856339
## 3 0 -0.12524427
## V1 mean
## 1 0 0.30228609
## 2 0 0.02684983
## 3 0 -0.50210273
## mean
## 1 0.03966937
## 2 -0.09766230
## 3 -0.05193020
11.2.5 V12
## V1 mean
## 1 0 -0.2760183
## 2 0 0.4737709
## 3 0 -0.3615137
## V1 mean
## 1 0 0.04579164
## 2 0 0.08184790
## 3 0 -0.09159167
## V1 mean
## 1 0 0.1937087
## 2 0 0.1254466
## 3 0 -0.3483074
## mean
## 1 -0.0181769789
## 2 -0.0003994343
## 3 -0.1233341437
11.2.6 V13
## V1 mean
## 1 0 0.1908502
## 2 0 0.1410300
## 3 0 -0.4431936
## V1 mean
## 1 0 0.04988614
## 2 0 0.02343661
## 3 0 -0.13928882
## V1 mean
## 1 0 0.2478954
## 2 0 0.0542209
## 3 0 -0.5425555
## mean
## 1 0.064558642
## 2 -0.001638195
## 3 -0.096772225
단순한 알고리듬일수록, 그리고 cluster3,2,1 으로 갈수록 cluster centroid 의 bias 가 심해진다는 것을 알 수 있었다.